import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the raw dataset.
data = pd.read_csv("data.csv")

# Select the feature columns used by the model.
features = ['tag', 'creator', 'contributor', 'title', 'year', 'month', 'bossStatus', 'hot_score']
raw_features = data[features]

# Impute missing values per column: a numeric column gets its OWN mean, a
# text column gets the empty string. Filling every column with the mean of
# 'hot_score' would inject a float into the text columns (creating a bogus
# one-hot category) and would impute 'year'/'month' with an unrelated value.
fill_values = {
    col: raw_features[col].mean()
    if pd.api.types.is_numeric_dtype(raw_features[col])
    else ''
    for col in features
}
X = raw_features.fillna(fill_values)

# Target: missing scores are replaced by the mean score.
# NOTE(review): imputing the target keeps unlabeled rows in training and
# evaluation; dropping those rows may be preferable — confirm intent.
y = data['score_per'].fillna(data['score_per'].mean())

# One-hot encode the text (categorical) feature columns.
X = pd.get_dummies(X)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a seeded 100-tree random forest regressor and fit it on the training
# split. fit() returns the estimator itself, so both steps can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
print(type(model), "模型")

# Show the held-out feature rows, then the model's predictions for them.
print(X_test)
y_pred = model.predict(X_test)
print(y_pred)


# A single hand-written record to score with the trained model.
sample = {'tag': '都市;爱情;剧情;生活;当代;内地',
          'creator': '',
          'contributor': "江疏影, 杨采钰, 张佳宁, 张慧雯, 李浩菲, 窦骁, 王安宇, 经超",
          'title': '抗日神剧',
          'year': 2024,
          'month': 3,
          'bossStatus': 'FREE',
          'hot_score': 6008}

# Wrap the single sample in a one-row DataFrame.
sample_df = pd.DataFrame([sample])

# One-hot encode the sample the same way as the training features.
sample_df = pd.get_dummies(sample_df)

# Encoding one row only yields dummy columns for the categories present in
# that row, so the result does not line up with the training matrix. Align it
# to the training columns: categories the model never saw are dropped, and
# training columns absent from the sample are added as 0.
sample_df = sample_df.reindex(columns=X.columns, fill_value=0)

print(sample_df)
print(model.predict(sample_df), "结果")

# Mean squared error of the predictions on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
